*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*	This program cleans the raw enrollment file covering school years
*	2011/2012 through 2018/2019.
*	----------------------------------------------------------------------------
*	IN: 	OC Data_2011_2016.csv (raw enrollment file)
*			OC Data_2018_2019.csv (raw enrollment file)
*			dps_school_classification_all_years.dta (school sector classifications)
*			dps_schools_classification_year_2016.dta (more recent school sector classifications)
*
*	OUT: 	enrollment_long.dta (unique on student-year; includes students who repeat grades)
*		 	enrollment_wide.dta (unique on student-grade)
*	----------------------------------------------------------------------------

	clear all
	set more off

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

	//october count
	insheet using "${rawdata}enrollment/OC Data_2011_2016.csv", clear
	tempfile first
	save "`first'"

	import excel using "${rawdata}enrollment/OC Data_2011_2018.xlsx", sheet("2017_2018") clear first case(lower)

	replace resschoolnum = "" if resschoolnum == "NULL"
	destring resschoolnum, replace

	append using "`first'"

***	clean

	//spring year convention
	replace year = year + 1
	label variable year "Spring year of academic year enrolled"

	//rename raw vars
	rename maskedid 		stu
	rename schoolnum 		sch
	rename schoolname 		schname
	rename resschoolnum 	resschnum
	rename raceethnicity 	race
	rename bilingual 		bilingual_raw
	rename esl 				esl_raw
	rename sped 			sped_raw

	//some schnums have multiple schnames that are all super similar. Keep latest.
	gsort sch -year
	by sch: gen new_name = schname if _n == 1
	by sch: replace new_name = new_name[_n - 1] if _n > 1
	replace schname = new_name
	drop new_name

**	demographics

*	race
	gen asian 		= inlist(race,"Asian","Asian ")
	gen black 		= race == "Black"
	gen hispanic	= race == "Hispanic"
	gen white		= race == "White"

*	language
	gen esl 		= inlist(esl_raw, "Yes - ESL Program" , "Redesignated ESL Program (Monitored Year 1)",  "Redesignated ESL Program (Monitored Year 2)" )

*	SPED
	gen sped = sped_raw == "Yes"

*	free or reduced lunch
	gen f_lunch		 	= frl == "Free"
	gen r_lunch	 		= frl == "Reduced"
	gen fr_lunch 		= f_lunch | r_lunch

*	gender
	gen female = gender == "Female"

*	repeaters
	duplicates tag stu grade , gen(x)
	gen repeat = x > 0
	drop x

**	label
	label var f_lunch 	"Free lunch"
	label var r_lunch 	"Reduced lunch"
	label var esl 		"English as a Second Language"
	label var stu 		"Student ID"
	label var sch 		"School code"
	label var schname 	"School name"

***  sectors

	local sectors cmo non_cmo innovation alternative magnet contract traditional

	gen schoolnum=sch
	merge m:1 schoolnum grade using "${rawdata}schools/dps_school_classification_all_years.dta", keep(1 3) gen(all_sect_merge)

	//flag schools still missing sectors
	egen sector_check = rowmax(`sectors')
	replace sector_check = 0 if sector_check == .
	gen missing = (sector_check == 0)

	//some schools aren't in the classification for the first two years but are in the later files
	preserve
		use "${rawdata}schools/dps_schools_classification_year_2016.dta", clear
		gen missing = 1
		tempfile sectors2016
		sa `sectors2016'
	restore

	//merge unmatched schools to newer file
	merge m:1 schoolnum grade missing using `sectors2016', gen(miss_sect_merge) keep(1 3 4 5) update

	//this is a charter school but we cant find any info on it for the 2014-2015 and it closed in that year
	replace charter = 1 if sch == 512 & year == 2015

	//set missings to zero
	foreach var in charter `sectors' {
		replace `var' = 0 if `var' == .
	}

	//update sector check
	drop sector_check
	egen sector_check = rowmax(`sectors')
	replace sector_check = 0 if sector_check == .

	//designate as traditional
	replace traditional = 1 if sector_check == 0

***	save

	duplicates drop
	isid stu year

	keep stu year grade resschnum sch schname race asian black hispanic white  ///
		esl sped f_lunch r_lunch fr_lunch female repeat ///
		charter innovation alternative magnet contract traditional cmo non_cmo kipp dsst strive

	qui compress

*	long
	save "${cleandata}enrollment_long.dta", replace
